package com.zhny.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Optional;

/**
 * Scraper that looks up an entry on Baidu Baike and returns its summary text.
 *
 * <p>Every failure path (blank name, malformed URL, connection error, page
 * without a summary block) yields an empty string rather than an exception.
 */
public class BaiDuScrapeUtils {

    /** Address prefix that the entry name is appended to. */
    private static final String BASE_URL = "https://baike.baidu.com/item/";

    /** CSS selector of the element whose text is returned as the summary. */
    private static final String SUMMARY_SELECTOR = "div.lemma-summary";

    /** Matches bracketed numeric markers such as {@code [1]} inside the summary text. */
    private static final String CITATION_REGEX = "\\[\\d+\\]";

    /** Upper bound, in milliseconds, for establishing the connection. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Upper bound, in milliseconds, for a single blocking read from the connection. */
    private static final int READ_TIMEOUT_MS = 10000;

    /**
     * Fetches the summary of the entry called {@code name}.
     *
     * <p>The name and the resulting summary are both echoed to standard output.
     *
     * @param name entry name, appended verbatim to the base address
     * @return the summary text with {@code [n]} markers removed, or an empty
     *         string when {@code name} is null/blank or the lookup fails
     */
    public static String baike(String name) {
        System.out.println(name);

        // A null name would otherwise be concatenated as the literal text "null"
        // and a blank one would request the bare base address; skip the request.
        boolean blankName = name == null || name.trim().isEmpty();
        String introduction = blankName ? "" : getContent(BASE_URL + name);

        System.out.println(introduction + '\n');
        return introduction;
    }

    /**
     * Downloads the page at {@code url} and extracts the text of its first
     * {@code div.lemma-summary} element.
     *
     * @param url absolute address of the page to fetch
     * @return the summary text with {@code [n]} markers removed, or an empty
     *         string when the URL is malformed, the page cannot be read, or the
     *         page contains no summary element
     */
    public static String getContent(String url) {
        URL urlObj;
        try {
            urlObj = new URL(url);
        } catch (MalformedURLException e) {
            System.out.println("The url was malformed!");
            return "";
        }

        try {
            URLConnection urlCon = urlObj.openConnection();
            // Without timeouts a stalled server would block the caller forever;
            // a timeout surfaces as an IOException and is handled below.
            urlCon.setConnectTimeout(CONNECT_TIMEOUT_MS);
            urlCon.setReadTimeout(READ_TIMEOUT_MS);

            // Close the stream explicitly instead of relying on the parser to do so.
            try (InputStream in = urlCon.getInputStream()) {
                // Decode the HTML as UTF-8; url doubles as the base URI for the document.
                Document doc = Jsoup.parse(in, "utf-8", url);

                // first() yields null when nothing matches the selector, so fall
                // back to an empty summary instead of dereferencing null.
                String contentText = Optional.ofNullable(doc.select(SUMMARY_SELECTOR).first())
                        .map(summary -> summary.text())
                        .orElse("");

                // Strip the "[number]" markers from the text.
                return contentText.replaceAll(CITATION_REGEX, "");
            }
        } catch (IOException e) {
            System.out.println("There was an error connecting to the URL");
            return "";
        }
    }

}
